In [15]:
# Required Packages
import pandas as pd
import numpy as np

import json
import folium

import seaborn as sns
import matplotlib.pyplot as plt

plt.style.use('ggplot')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

Los Angeles County Restaurant and Market Violations and Inspections

In this article, we use a dataset contains Environmental Health Violations for Restaurants and Markets in Los Angeles County. This dataset can be found here and here.

Los Angeles County Environmental Health is responsible for checking food violations for all unincorporated areas and 85 of the 88 cities in the County. This dataset does not include Pasadena, Long Beach or Vernon (each has its own city health department).

Loading Datasets

The data that has been processed before.

In [16]:
Violations_df = pd.read_csv('Data/Clean_Violations_df.csv')
Inspections_df = pd.read_csv('Data/Clean_Inspections_df.csv')
Data  = pd.read_csv('Data/Data_Inspections_Violations_df.csv')
Violation_Dictionary_df  = pd.read_csv('Data/Violation_Dictionary_df.csv')
Violations_matrix  = pd.read_csv('Data/Violations_matrix_df.csv')
In [17]:
Data['Facility Zip'] = Data['Facility Zip'].astype(str)
Data['Facility Zip'] = Data['Facility Zip'].apply(lambda x: x[:5])

Geographical Analysis

In [18]:
Facilities_Geographical_df = Data.groupby('Facility Zip').agg(Average_Score=('Score', 'mean'),
                                                              Total_Facilities=('Facility ID', pd.Series.nunique)).reset_index()
Facilities_Geographical_df = Facilities_Geographical_df.rename(columns = {'Average_Score':'Average Score',
                                                                          'Total_Facilities':'Total Facilities'})
Facilities_Geographical_df = Facilities_Geographical_df.dropna()
Facilities_Geographical_df.head().style.hide_index() 
Out[18]:
Facility Zip Average Score Total Facilities
90001 90.6628 275
90002 92.7089 64
90003 90.6792 269
90004 92.4268 263
90005 90.5262 286

Next, we can export a geojson file from here. However, we would like to reduce the size of this file for the sake of having an optimal computation.

In [19]:
# loading the GeoJSON file
with open('Data/LA_ZIP_Codes.geojson', 'r') as jsonFile:
    Zipcode_Data = json.load(jsonFile)

# Creating a list from Zip codes
mylist = Facilities_Geographical_df['Facility Zip'].unique().tolist()

# removing ZIP codes that are not in our dataset
temp = []
for i in range(len(Zipcode_Data['features'])):
    if Zipcode_Data['features'][i]['properties']['name'] in mylist:
        temp.append(Zipcode_Data['features'][i])

# creating a new JSON file
Reduced_Zipcode_Data = dict.fromkeys(['type','features'])
Reduced_Zipcode_Data['type'] = 'FeatureCollection'
Reduced_Zipcode_Data['features'] = temp

del Zipcode_Data, mylist, temp
# save the JSON file
open("Data/reduced_LA_ZIP_Codes.json", "w").write(json.dumps(Reduced_Zipcode_Data,
                                                             sort_keys=True, indent=4, separators=(',', ': ')))
del Reduced_Zipcode_Data
In [20]:
def plot_map(Inp_Column, Text_Legend = '', Inp_Df = Facilities_Geographical_df, Zoom_Level=8):

    # reading of the updated GeoJSON file
    Geographical_Data = r'Data/reduced_LA_ZIP_Codes.json'

    # initiating a Folium map
    m = folium.Map(location = [34.052, -118.243], zoom_start = Zoom_Level)

    # creating a map
    m.choropleth(geo_data = Geographical_Data, fill_opacity = 0.8, line_opacity = 0.2,
        data = Inp_Df, key_on = 'feature.properties.name', columns = ['Facility Zip', Inp_Column],
        fill_color = 'RdYlGn', legend_name = Text_Legend)
    folium.LayerControl().add_to(m)
    
    # Show the map
    return m
In [21]:
plot_map('Average Score', 'The Average Score of Facilities')
Out[21]:
In [22]:
plot_map('Total Facilities','Total Facilities')
Out[22]:

Code Violations by Each Facility

Frist, let's addd the Violation Matrix to our Data

In [23]:
Data=pd.merge(Data, Violations_matrix, on='Facility ID', how='right')

Moreover, previously, we found the following violation code as the most correlated ones.

'F037', 'F040', 'F007', 'F044', 'MF34', 'W007', 'W035', 'MF41', 'MF45', 'W017', 'W021', 'W034', 'MF31', 'W033', 'MF38', 'W032', 'MF15', 'MF08', 'F035', 'F033', 'W044', 'MF36', 'W011', 'W031'

Thus,

In [26]:
Facility_Violations_df = Data.groupby(['Facility Zip','Facility ID']).agg({'F037': np.mean,'F040': np.mean,'F007': np.mean,
                        'F044': np.mean,'MF34': np.mean,'F037': np.mean,'W007': np.mean,'W035': np.mean,'MF41': np.mean,
                        'W017': np.mean,'W021': np.mean,'W034': np.mean,'MF31': np.mean,'W033': np.mean,'MF38': np.mean,
                        'W032': np.mean,'MF15': np.mean,'MF08': np.mean,'F035': np.mean,'W044': np.mean,'MF36': np.mean,
                        'W011': np.mean,'W031': np.mean})
Facility_Violations_df = Facility_Violations_df.groupby(level=0).mean()
Facility_Violations_df.reset_index(inplace=True)
In [27]:
Facility_Violations_df.head().style.hide_index()
Out[27]:
Facility Zip F037 F040 F007 F044 MF34 W007 W035 MF41 W017 W021 W034 MF31 W033 MF38 W032 MF15 MF08 F035 W044 MF36 W011 W031
90001 1.17091 0.876364 0.636364 2.16 0 0 0 0 0 0 0 0 0 0 0 0 0 1.57091 0 0 0 0
90002 0.703125 0.59375 0.40625 1.9375 0 0 0 0 0 0 0 0 0 0 0 0 0 1.20312 0 0 0 0
90003 0.914498 0.933086 0.609665 2.46097 0 0 0 0 0 0 0 0 0 0 0 0 0 1.91078 0 0 0 0
90004 0.726236 0.931559 0.539924 2.05323 0 0 0 0 0 0 0 0 0 0 0 0 0 1.84411 0 0 0 0
90005 1.25524 1.3951 0.611888 2.68182 0 0 0 0 0 0 0 0 0 0 0 0 0 1.96853 0 0 0 0

Here, we only plot the following columns

In [131]:
temp = Facility_Violations_df[0:1].any().to_dict()
mylist=list({k:v for k, v in temp.items() if v == True}.keys())
print(mylist[1:])
['F037', 'F040', 'F007', 'F044', 'F035']

Therefore,

In [156]:
vCode = 'F037'
map_title = ('Average Violation: % s' % Violation_Dictionary_df[Violation_Dictionary_df['Violation Code']== vCode]['Violation Description'].values[0])
plot_map(vCode, map_title ,Facility_Violations_df)
Out[156]:
In [157]:
vCode = 'F040'
map_title = ('Average Violation: % s' % Violation_Dictionary_df[Violation_Dictionary_df['Violation Code']== vCode]['Violation Description'].values[0])
plot_map(vCode, map_title ,Facility_Violations_df)
Out[157]:
In [158]:
vCode = 'F007'
map_title = ('Average Violation: % s' % Violation_Dictionary_df[Violation_Dictionary_df['Violation Code']== vCode]['Violation Description'].values[0])
plot_map(vCode, map_title ,Facility_Violations_df)
Out[158]:
In [159]:
vCode = 'F044'
map_title = ('Average Violation: % s' % Violation_Dictionary_df[Violation_Dictionary_df['Violation Code']== vCode]['Violation Description'].values[0])
plot_map(vCode, map_title ,Facility_Violations_df)
Out[159]:
In [160]:
vCode = 'F035'
map_title = ('Average Violation: % s' % Violation_Dictionary_df[Violation_Dictionary_df['Violation Code']== vCode]['Violation Description'].values[0])
plot_map(vCode, map_title ,Facility_Violations_df)
Out[160]:

Program Elements: Seats

In [38]:
PE_Seats_Counts = pd.DataFrame(Data.groupby(['Facility Zip','PE_Seats']).size().unstack(1)).reset_index()
PE_Seats_Counts.head().style.hide_index()
Out[38]:
Facility Zip 0-10 SQ. FT. 0-1999 SQ. FT. 0-30 SQ. FT. 0-999 SQ. FT. 1-1999 SQ. FT. 1-4999 SQ. FT. 1000-1999 SQ. FT. 10000+ SQ. FT. 151+ SQ. FT. 2000+ SQ. FT. 2000-3999 SQ. FT. 2000-4999 SQ. FT. 2000-5999 SQ. FT. 31-60 SQ. FT. 4000-9999 SQ. FT. 6000+ SQ. FT. Other
90001 nan nan 2638 nan 1479 nan nan nan 154 406 nan nan nan 822 nan nan nan
90002 nan nan 362 nan 379 nan nan nan 23 97 nan nan nan 54 nan nan nan
90003 nan nan 2690 nan 2268 nan nan nan nan 224 nan nan nan 382 nan nan nan
90004 nan nan 1828 nan 496 nan nan nan 104 386 nan nan nan 1424 nan nan nan
90005 nan nan 2272 nan 620 nan nan nan 473 196 nan nan nan 2447 nan nan nan

We would like to plot only the following columns:

In [110]:
temp = PE_Seats_Counts[0:1].any().to_dict()
mylist=list({k:v for k, v in temp.items() if v == True}.keys())
print(mylist[1:])
['0-30 SQ. FT.', '1-1999 SQ. FT.', '151+ SQ. FT.', '2000+ SQ. FT.', '31-60 SQ. FT.']
In [115]:
plot_map('0-30 SQ. FT.', 'Facility Total Count (0-30 SQ. FT.)' ,PE_Seats_Counts)
Out[115]:
In [112]:
plot_map('1-1999 SQ. FT.', 'Facility Total Count (1-1999 SQ. FT.)' ,PE_Seats_Counts)
Out[112]:
In [113]:
plot_map('151+ SQ. FT.', 'Facility Total Count (151+ SQ. FT.)' ,PE_Seats_Counts)
Out[113]:
In [114]:
plot_map('31-60 SQ. FT.', 'Facility Total Count (31-60 SQ. FT.)' ,PE_Seats_Counts)
Out[114]:

Program Elements: Type

In [117]:
PE_type_counts = pd.DataFrame(Data.groupby(['Facility Zip','PE_Type']).size().unstack(1)).reset_index()
PE_type_counts.head().style.hide_index()
Out[117]:
Facility Zip Caterer Food Market Retail Food Market Wholesale Food Processing Wholesale Food Vehicle Commissary Food Warehouse Interim Housing FF LIC HTH Care Food FAC Private School Cafeteria Restaurant Swap Meet Prepackaged Food Stand
90001 nan 1885 nan nan nan nan nan nan nan 3663 nan
90002 nan 476 nan nan nan nan nan nan nan 455 nan
90003 nan 2492 nan nan nan nan nan nan nan 3213 nan
90004 nan 856 nan nan nan nan 26 nan nan 4329 nan
90005 nan 816 nan nan nan nan nan nan nan 6864 nan

We only plot the following columns.

In [119]:
temp = PE_type_counts[0:1].any().to_dict()
mylist=list({k:v for k, v in temp.items() if v == True}.keys())
print(mylist[1:])
['Food Market Retail', 'Restaurant']
In [120]:
plot_map('Food Market Retail', 'PE Type: Food Market Retail' ,PE_type_counts)
Out[120]:
In [121]:
plot_map('Restaurant', 'PE Type: Restaurant' ,PE_type_counts)
Out[121]:

Program Elements: Risk

In [124]:
PE_Risk_Counts = pd.DataFrame(Data.groupby(['Facility Zip','PE_Risk']).size().unstack(1)).reset_index()
PE_Risk_Counts.head().style.hide_index()
Out[124]:
Facility Zip High Risk Low Risk Moderate Risk
90001 3641 977 930
90002 422 216 293
90003 2973 1564 1168
90004 3868 452 865
90005 6221 512 947
In [126]:
plot_map('Low Risk', 'PE Risk: Low Risk' ,PE_Risk_Counts)
Out[126]:
In [127]:
plot_map('Moderate Risk', 'PE Risk: Moderate Risk' ,PE_Risk_Counts)
Out[127]:
In [128]:
plot_map('High Risk', 'PE Risk: High Risk' ,PE_Risk_Counts)
Out[128]: